Read in source data.
library(tidyverse)
library(mrcommons)
library(mrdrivers)
library(magrittr)
# reed in source data
foodEmi <- readSource("EDGARfood", subtype="foodSystemSector")
foodBal <- calcOutput("FAOmassbalance", aggregate = F)
foodBal <- time_interpolate(foodBal, seq(1990,2018) )
Process source data
# aggregate food products to kcr kli kds k
sets <- read.csv(system.file("extdata", "magpiesets.csv", package = "magpiesets"))
sets %>% select(all,kcr,kli,ksd,k) -> mapping
foodBal %>% toolAggregate(rel= slice(mapping, 1:44),from="all",to="kcr+kli+ksd+k" ,
dim = 3.1, partrel =T) %>%
.[,,"",invert=T] -> foodBal_p
# subset the data that will be used for the emission factors
foodBal_p <- foodBal_p[,,c("wm","dm","nr")][,,c("production","food", "milling" ,
"refining", "extracting","fermentation",
"distilling", "waste", "domestic_supply",
"households")]
# aggregate the processing items to processing cateory
map2 <- tibble(source= c("production","food", "milling" ,"refining",
"extracting","fermentation", "distilling",
"waste", "domestic_supply", "households") ,
tar = c("production","processing", "processing" ,
"processing", "processing","processing",
"processing", "waste","domestic_supply",
"households"))
foodBal_p2<- toolAggregate(foodBal_p, map2, from="source", to="tar",dim=3.2)
# pivot the objects to wide format
pivot_wider(as.data.frame( foodBal_p2), id_cols = c("Year", "Region"), names_from = starts_with("Data") , values_from = "Value") ->datBal
pivot_wider(as.data.frame( foodEmi), id_cols = c("Year", "Region"), names_from = starts_with("Data") , values_from = "Value") ->datEmi
data<- left_join(datEmi, datBal)
Add possible drivers/predictors to the data.
########## add drivers to the data
# add finer resolved processing items
# pivot_wider(as.data.frame( foodBal_p), id_cols = c("Year", "Region"), names_from = starts_with("Data") , values_from = "Value") ->datBal_fine
# data <- left_join(data,datBal_fine)
### add extra infromation from foodBal as drivers
# add finer resolved processing and production items
foodBal[,,c("food","milling","refining","extracting" ,"distilling") ][,,"dm"] %>% dimSums(.,dim=3.2) %>% collapseDim() %>% as.data.frame() %>%
pivot_wider(names_from = Data1, values_from = "Value") -> driv_procc
foodBal[,,c("production") ][,,"dm"] %>% collapseDim() %>% as.data.frame() %>%
pivot_wider(names_from = Data1, values_from = "Value") -> driv_prod
driv_procc %>% rename_with( .cols = !(1:3) , .fn = ~ paste0("driv_proc_", .x) ) -> driv_procc
driv_prod %>% rename_with( .cols = !(1:3) , .fn = ~ paste0("driv_prod_", .x) ) -> driv_prod
data<- left_join(data, driv_procc )
data<- left_join(data, driv_prod )
#add export as driver
exp <- dimSums(foodBal[,,"export"][,,"wm"], dim=3)
data<- left_join( data, select(as.data.frame( exp) , Year, Region, Value),
by=c("Region","Year")) %>% rename(driv_exp = Value)
### add drivers from other data sources
#read data
gdp <- calcOutput("GDPPast", aggregate = F)
pop <- calcOutput("PopulationPast", aggregate = F)
affl <- collapseDim(gdp)/collapseDim(pop)
urb <- calcOutput("UrbanPast", aggregate = F)
dS <- calcOutput("DevelopmentState", aggregate = F) # needs to be interpolated
dS <- time_interpolate(dS, 1990:2018)
govI <- calcOutput("GovernanceIndicator", aggregate = F) # only from 1995
govI <- time_interpolate(govI, 1990:2018)
#add data
data<- left_join( data, select(as.data.frame( affl) , Year, Region, Value),
by=c("Region","Year")) %>% rename(driv_aff = Value)
data<- left_join( data, select(as.data.frame( pop) , Year, Region, Value),
by=c("Region","Year")) %>% rename(driv_pop = Value)
data<- left_join( data, select(as.data.frame( gdp) , Year, Region, Value),
by=c("Region","Year")) %>% rename(driv_gdp = Value)
data<- left_join( data, select(as.data.frame( urb) , Year, Region, Value),
by=c("Region","Year")) %>% rename(driv_urb = Value)
data<- left_join( data, select(as.data.frame( govI[,,"SSP1"]) , Year, Region, Value),
by=c("Region","Year")) %>% rename(driv_govI = Value)
data<- left_join( data, select(as.data.frame( dS[,,"SSP1"]) , Year, Region, Value),
by=c("Region","Year")) %>% rename(driv_dS = Value)
## add some infromative data about regions and country names
regmap <- toolGetMapping("regionmappingH12.csv")
data<- left_join(data, regmap %>% rename(Region=CountryCode) ,by="Region") %>%
rename( info_countrynames = X, driv_largReg = RegionCode)
## remove some NA or zero rows
#sort out country that are completely zero in fooBal
data %>% group_by(Region) %>% summarise(across(starts_with("k"), sum)) %>% rowwise %>%
mutate(sum= sum(across(starts_with("k") )) ) %>% filter(sum==0) %>% pull(Region) -> sortout # zeros
data %<>% filter(! Region %in% sortout)
#remove emission columns that are completely zero
data %<>% select(!(contains("F-gases" )&!contains("Packaging")&!contains("Retail") ))
#remove households_dm which is a completly zero data column
# data %>% select(contains("households_dm")) %>%
# summarise_each(function(x)sum(x, na.rm = T)) %>% t()
data %<>% select(!(contains("households_dm") ))
# remove vestigous cell columns
data %<>% select(-Cell)
Emission factors assume a linear relationship between the emissions from a given source and a related (economic) activity. While the linear relation does not need to be the same for all countries and times, a good starting point to choose which emission to link to which activity might still be the global correlation coefficients.
### corellation plots
data %>% select(!Year&!Region &!starts_with("driv_")&!starts_with("inf") )-> datac
datac %>% drop_na() %>% cor() %>%
.[1:26,27:ncol(datac)] %>% t -> tem
tem[order(str_extract(row.names(tem), "_.*_") ) , ] %>% corrplot::corrplot( col.lim = c(-0.1,1), method = "number")
The correlation structure is less clear than one could hope for. For
the production emissions it seems common-sensical to choose a production
quantity as a denominator. “k_production_dm” seems to be a good first
idea due to the high correlations.
For the processing emissions there is no single item with high
correlations. “k_processing_dm” is chosen for N2O , “kli_processing_dm”
for CO2 and “kcr_processing_wm” for CH4.
As said the emission factors can vary for different times and regions. We are interested in finding the underlying drivers in order to predict emissions for different economic scenarios.
A simple linear regression model for the emission factors is
\[ emission_{g,c,y}/ m_{c,y} =a+b_{1}* d_{1,c,y}+b_2* d_{2,c,y} + b_3* d_{3,c,y}+ \cdots + \eta \] with \[\eta \sim N(0,\epsilon )\] g is the GHG, c is the country, y is the year, a is the intercept, \(b_i\) are the slopes, \(d_{i,c,y}\) are the drivers in the data and \(\eta\) is the normally distributed error.
Since we are interested in predicting the emissions and not the emission factor in the end it makes sense to work with the equivalent formulation
\[ emission_{g,c,y} =a* m_{c,y}+ b_1* d_{1,c,y}* m_{c,y}+b_2* d_{2,c,y}*m_{c,y} + b_3* d_{3,c,y}*m_{c,y}+ \cdots + \gamma \] with \[\eta \sim N(0,\epsilon m_{c,y} )\]
It should be mentioned that a standard assumption of this regression model is violated in our case: The observations \(emission_{g,c,y}\) \(m_{c,y}\) \(d_{i,c,y}\) for different values of c and y are not independent. Hence the results, especially the significance of influence of drivers, need to be treated with caution. A way of avoiding this assumption, would be the use of mixed effect models. Still to get a first idea of the drivers the standard model is used.
The following linear models where builded by successive elimination of independent variables. Further olsrr a tool for automatic selection of independent variables was used.
The following function is helful for deciding which variable to eliminate.
elimHelper <- function(lR) {
f=NULL
cN <- colnames(lR)
for (n in cN[3:length(cN)]){
lR2 <- lR %>% select( -n)
model2 <- lm( dep ~ k_production_nr +.:k_production_nr+0 , data = lR2 )
r2 <- summary(model2)$r.squared
add <- setNames( r2,n)
f <- c(f, add)
}
model <- lm( dep ~ k_production_nr +.:k_production_nr+0 , data = lR)
r<-summary(model)$coefficients[,4]
print("p-value")
print(sort(r))
print("r-change")
print(summary(model)$r.squared)
print(sort(f))
}
Build the model and print out some statistics.
#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA
#### N2O production emissions
lR <- data %>% mutate(dep=GWP_100_N2O_Production)-> q
q %>% select(dep,k_production_nr,driv_prod_oilpalm,driv_prod_pasture,driv_prod_res_nonfibrous, driv_pop, driv_dS, driv_aff)->lR
# get nonzero processing items
data %>% select(starts_with("driv_pr")) %>% summarise(across(where(is.numeric), sum )) %>% unlist() -> dd
dd[dd!=0] %>% names() -> nonzero_driv
# model <- lm( dep ~ k_production_nr + (driv_prod_oilpalm):k_production_nr +
# (driv_prod_oilpalm):k_production_nr+
# (driv_prod_pasture):k_production_nr+
# (driv_prod_res_nonfibrous):k_production_nr+
# driv_pop+
# driv_aff+
# driv_rice:k_production_nr +0 , data = lR )
model2 <- lm( dep ~ k_production_nr +k_production_nr:driv_prod_pasture+ k_production_nr:driv_prod_res_nonfibrous +0 , data = lR )
#summary(model)
#lR %>% colnames() %>% dput
#k<- olsrr::ols_step_all_possible(model)
summary(model2)
##
## Call:
## lm(formula = dep ~ k_production_nr + k_production_nr:driv_prod_pasture +
## k_production_nr:driv_prod_res_nonfibrous + 0, data = lR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37111 -418 -12 338 37317
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## k_production_nr 1.101e+04 3.612e+01 304.91 <2e-16
## k_production_nr:driv_prod_pasture -5.998e+00 9.351e-02 -64.15 <2e-16
## k_production_nr:driv_prod_res_nonfibrous 1.673e+02 2.241e+00 74.64 <2e-16
##
## k_production_nr ***
## k_production_nr:driv_prod_pasture ***
## k_production_nr:driv_prod_res_nonfibrous ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3692 on 5420 degrees of freedom
## (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared: 0.9845, Adjusted R-squared: 0.9845
## F-statistic: 1.147e+05 on 3 and 5420 DF, p-value: < 2.2e-16
### driver: rice cultivation
### literature : rice cultivation, nitrogen fertilisation (manure, sythetic N)
Plot predictions vs datapoints.
p<-ggplot(
data #%>% filter(Year=="2000" )
, aes(x=1.099e+04*k_production_nr
-5.977e+00 * k_production_nr*driv_prod_pasture
+ k_production_nr*driv_prod_res_nonfibrous *1.676e+02,
y= GWP_100_N2O_Production, label = Region, Year= Year ,
#color = driv_prod_rice_pro/(k_production_dm),
country= info_countrynames
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA
#### CO2 production emissions, k_households_nr works better as denominator, why??
lR <- data %>% mutate(dep=GWP_100_CO2_Production)-> q
q %>% select(dep,k_production_nr,starts_with("driv_prod"), driv_aff, driv_dS, driv_exp, driv_govI, driv_pop, k_households_nr)->lRPre
lRPre %>% select(-c(driv_prod_ethanol, driv_prod_oilpalm,driv_prod_oils,driv_prod_others ,
driv_prod_sunflower,driv_prod_foddr, driv_prod_betr,driv_prod_begr,
driv_prod_scp, driv_prod_cottn_pro, driv_prod_trce, driv_prod_tece,
driv_prod_livst_egg, driv_prod_res_nonfibrous, driv_prod_woodfuel,driv_prod_alcohol,
driv_prod_brans, driv_exp,driv_prod_res_cereals,driv_prod_res_fibrous,
driv_prod_livst_rum, driv_prod_soybean, driv_prod_oilcakes,
driv_prod_molasses, driv_prod_livst_chick ,driv_prod_sugar,
driv_prod_cassav_sp, driv_prod_fibres, driv_prod_rapeseed, driv_prod_livst_pig,
driv_aff, driv_prod_fish, driv_prod_sugr_beet, driv_prod_sugr_cane,
driv_prod_puls_pro, driv_govI,driv_dS, driv_prod_rice_pro,
)) -> lR
#elimHelper(lR)
#model <- lm( dep ~ k_production_nr +.:k_production_nr+0 , data = lR )
model2 <- lm( dep ~ k_production_nr+ k_production_nr:driv_prod_pasture+ k_production_nr:driv_pop+0 , data = lR )
#model3 <- lm( dep ~ k_households_nr+ k_households_nr:. +0 , data = lRPre )
summary(model2)
##
## Call:
## lm(formula = dep ~ k_production_nr + k_production_nr:driv_prod_pasture +
## k_production_nr:driv_pop + 0, data = lR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35344 -566 -13 533 83912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## k_production_nr 4385.55214 55.78662 78.61 <2e-16 ***
## k_production_nr:driv_prod_pasture -4.18502 0.13878 -30.16 <2e-16 ***
## k_production_nr:driv_pop 5.06529 0.07384 68.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5716 on 5420 degrees of freedom
## (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared: 0.9176, Adjusted R-squared: 0.9176
## F-statistic: 2.013e+04 on 3 and 5420 DF, p-value: < 2.2e-16
#k<-olsrr::ols_step_all_possible(model3)
### drivers: no clear driver visible, in SSA urbanization makes a difference
### literature: mechanization of agriculture
p<-ggplot(
data #%>% filter(Year=="2000" )
, aes(x=4384.08708*k_production_nr
-4.18417 *k_production_nr*driv_prod_pasture +
k_production_nr*driv_pop * 5.06605
, y= GWP_100_CO2_Production, label = Region,
Year= Year,
country = info_countrynames # ,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA
#### CH4 production emissions
dataM <- select(data,-starts_with("info"))
dataM %>% mutate(dep=GWP_100_CH4_Production) %>% select(dep,k_production_nr,starts_with("driv_prod"), driv_aff, driv_dS, driv_exp, driv_govI, driv_pop)->lRPre
# these variables where eliminated since they did not contribute to the dependent variable
lRPre %>% select(-c(driv_prod_res_fibrous,driv_prod_oilpalm, driv_prod_rapeseed,driv_prod_res_nonfibrous,
driv_prod_ethanol, driv_prod_livst_chick, driv_aff , driv_prod_puls_pro , driv_prod_scp , driv_prod_betr, driv_prod_betr , driv_prod_begr , driv_prod_wood , driv_prod_tece, driv_prod_fish, driv_dS , driv_prod_foddr , driv_prod_others , driv_prod_sugr_beet, driv_prod_sugar , driv_prod_livst_milk , driv_prod_alcohol , driv_prod_molasses, driv_exp , driv_prod_distillers_grain , driv_prod_maiz, driv_prod_potato , driv_prod_cassav_sp , driv_prod_oils , driv_prod_oilcakes , driv_prod_sunflower, driv_prod_cottn_pro, driv_prod_groundnut , driv_prod_livst_egg , driv_prod_fibres, driv_prod_woodfuel , driv_prod_trce )) -> lR
#elimHelper(lR)
#model <- lm( dep ~ k_production_nr + .:k_production_nr+0 , data = lR)
#k<-olsrr::ols_step_all_possible(model)
# resulting model
model <- lm( dep ~ k_production_nr +
k_production_nr:driv_prod_brans +
k_production_nr:driv_prod_res_cereals +0 , data = lR)
summary(model)
##
## Call:
## lm(formula = dep ~ k_production_nr + k_production_nr:driv_prod_brans +
## k_production_nr:driv_prod_res_cereals + 0, data = lR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -97295 -636 5 1105 91959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## k_production_nr 24641.624 105.671 233.2 <2e-16 ***
## k_production_nr:driv_prod_brans -1255.008 12.312 -101.9 <2e-16 ***
## k_production_nr:driv_prod_res_cereals 179.961 1.505 119.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12290 on 5420 degrees of freedom
## (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared: 0.9723, Adjusted R-squared: 0.9723
## F-statistic: 6.348e+04 on 3 and 5420 DF, p-value: < 2.2e-16
### old model
# p<- ggplot(
# data #%>% filter( Year=="2000")
# , aes(x=k_production_nr, y= GWP_100_CH4_Production, label = Region, Year= Year ))+
# geom_point(aes(color = ((driv_rice+driv_milk*2)/driv_aff)^0.6 ) )+ #geom_text(aes(color =
# geom_smooth(method='lm', formula= y~x, aes(group=1))
#
# plotly::ggplotly(p)
### drivers: rice, milk prodcutin, different world regions
###: literature: rice, enteric fermatation
p<- ggplot(
data #%>% filter( Year=="2000")
, aes(x= k_production_nr * 24571.241+
k_production_nr*driv_prod_brans * -1251.170 +
k_production_nr*driv_prod_res_cereals * 179.799
, y= GWP_100_CH4_Production,
label = Region,
Year= Year,
rum= driv_prod_livst_rum,
country = info_countrynames ))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
elimHelper_proc <- function(lR) {
f=NULL
cN <- colnames(lR)
for (n in cN[3:length(cN)]){
lR2 <- lR %>% select( -n)
model2 <- lm( dep ~ kcr_processing_wm +.:kcr_processing_wm+0 , data = lR2 )
r2 <- summary(model2)$r.squared
add <- setNames( r2,n)
f <- c(f, add)
}
model <- lm( dep ~ kcr_processing_wm +.:kcr_processing_wm+0 , data = lR)
r<-summary(model)$coefficients[,4]
print("p-value")
print(sort(r))
print("r-change")
print(summary(model)$r.squared)
print(sort(f))
}
#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA
#### N2O processing emissions
# best denominator: k_processing_dm , alternatively kli_processing_dm
# data$driv_rice
lR <- data %>% mutate(dep=GWP_100_N2O_Processing)%>% select(dep,k_processing_dm ,starts_with("driv_"), k_processing_nr, kcr_processing_nr)
#model <- lm( dep ~ k_processing_dm+ .:k_processing_dm +0, data = lR )
#summary(model)
model2 <- lm( dep ~k_processing_dm + k_processing_dm:driv_pop + k_processing_dm:k_processing_nr+ k_processing_dm:kcr_processing_nr+0, data=lR)
summary(model2)
##
## Call:
## lm(formula = dep ~ k_processing_dm + k_processing_dm:driv_pop +
## k_processing_dm:k_processing_nr + k_processing_dm:kcr_processing_nr +
## 0, data = lR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -499.15 -1.21 0.55 6.59 280.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## k_processing_dm 7.663e+00 3.151e-02 243.18 <2e-16 ***
## k_processing_dm:driv_pop 3.321e-03 3.708e-05 89.54 <2e-16 ***
## k_processing_dm:k_processing_nr 2.231e+00 2.303e-02 96.88 <2e-16 ***
## k_processing_dm:kcr_processing_nr -3.308e+00 3.443e-02 -96.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.24 on 5419 degrees of freedom
## (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared: 0.9903, Adjusted R-squared: 0.9903
## F-statistic: 1.382e+05 on 4 and 5419 DF, p-value: < 2.2e-16
#k<- olsrr::ols_step_all_possible(model)
#view(k)
# p<- ggplot(
# data #%>% filter(Year=="2000" )
# , aes(x=7.6618298 *k_processing_dm, y= GWP_100_N2O_Processing, Region=Region
# ))+
# geom_point(
# aes( )
# )+
# #geom_text(aes(label = Region))+
# geom_smooth(method='lm', formula= y~x, aes(group=1))# befor regression
### driver_for_kli_processing_dm: driv_pop, k_processing_nr, kcr_processing_nr; Or: brans and oilpalm
# maybe its about what products are procesed? k_processing_nr k_processing_dm interaction,
# more part of crop processing seems to result in less NO2 emissions. Higher pop results in higher NO2 emissions
p<- ggplot(
data #%>% filter(Year=="2000" )
, aes(x=7.6618298 *k_processing_dm +
k_processing_dm*driv_pop * 0.0033206 +
k_processing_dm*k_processing_nr *2.2303566 +
k_processing_dm*kcr_processing_nr* -3.3071144
, y= GWP_100_N2O_Processing
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA
#### CO2 processing emissions
#data$driv_exp
data %>% mutate(dep= GWP_100_CO2_Processing) %>% select(dep, kli_processing_nr ,driv_proc_brans, driv_proc_sugr_beet ) -> lR
model <- lm( dep ~ kli_processing_nr+ driv_proc_brans:kli_processing_nr + kli_processing_nr:driv_proc_sugr_beet+ +0, data = lR )
summary(model)
##
## Call:
## lm(formula = dep ~ kli_processing_nr + driv_proc_brans:kli_processing_nr +
## kli_processing_nr:driv_proc_sugr_beet + +0, data = lR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36724 -76 -3 218 42737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## kli_processing_nr 14616.68 407.25 35.89 <2e-16 ***
## kli_processing_nr:driv_proc_brans 1651.53 19.31 85.55 <2e-16 ***
## kli_processing_nr:driv_proc_sugr_beet 5739.90 75.15 76.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2421 on 5420 degrees of freedom
## (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared: 0.9534, Adjusted R-squared: 0.9533
## F-statistic: 3.693e+04 on 3 and 5420 DF, p-value: < 2.2e-16
#k<- olsrr::ols_step_all_possible(model)
### driver: sugr and brans processing, taking squroot improves slitly the r^2
p2<- ggplot(
data #%>% filter(Region %in% c("BRA", "IDN", "IND") )
, aes( x= kli_processing_nr * 14614.71 +
kli_processing_nr*driv_proc_brans *1651.60 +
kli_processing_nr*driv_proc_sugr_beet *5740.25 ,
y= GWP_100_CO2_Processing, label = Region, Year= Year ,
country = info_countrynames
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))
For this emission a slightly different method was tried. The model is fitted to a random subset of the data and then tested on the other part.
#### CH4 processing emissions
# main driver: kcr_processing_wm
set.seed(1001)
data %>% sample_n(2000) -> data_fit # choose model fitting sample
data_test <- data %>% anti_join(data_fit)
lR <- data_fit %>% mutate(dep=GWP_100_CH4_Processing)%>% select(dep,kcr_processing_wm, k_processing_wm, kli_processing_wm, driv_aff, driv_pop, driv_gdp, driv_dS, driv_urb, k_processing_nr, starts_with("driv_Proc") )
lR_f <- lR %>% select(-c( driv_proc_sugar , driv_aff , driv_urb,
driv_proc_livst_chick, driv_proc_groundnut, driv_proc_potato, driv_proc_puls_pro , driv_proc_tece, driv_proc_cassav_sp , driv_proc_maiz, driv_proc_oilpalm , driv_proc_rapeseed,
driv_proc_alcohol , driv_proc_molasses , driv_proc_woodfuel ,driv_proc_wood , driv_proc_scp , driv_proc_res_nonfibrous , driv_proc_res_fibrous , driv_proc_res_cereals , driv_proc_pasture , driv_proc_betr , driv_proc_begr , driv_proc_foddr, driv_proc_fibres , driv_proc_oilcakes , driv_proc_ethanol , driv_proc_distillers_grain , driv_proc_fish , driv_proc_livst_rum , driv_proc_livst_pig ,driv_gdp, driv_proc_cottn_pro , driv_proc_sunflower, driv_proc_trce , driv_proc_livst_egg , driv_proc_rice_pro , driv_proc_oils , driv_proc_brans
))
#elimHelper_proc(lR_f)
model <- lm( dep ~ kcr_processing_wm +kcr_processing_wm:driv_pop+ kcr_processing_wm:driv_proc_others+0 , data = lR_f)
summary(model)
##
## Call:
## lm(formula = dep ~ kcr_processing_wm + kcr_processing_wm:driv_pop +
## kcr_processing_wm:driv_proc_others + 0, data = lR_f)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8655.4 -213.2 -41.0 3.2 8231.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## kcr_processing_wm 37.7156677 0.3359690 112.26 <2e-16 ***
## kcr_processing_wm:driv_pop -0.0289276 0.0005299 -54.59 <2e-16 ***
## kcr_processing_wm:driv_proc_others 0.2461490 0.0080560 30.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 837.9 on 1963 degrees of freedom
## (34 Beobachtungen als fehlend gelöscht)
## Multiple R-squared: 0.9006, Adjusted R-squared: 0.9004
## F-statistic: 5927 on 3 and 1963 DF, p-value: < 2.2e-16
#k<-olsrr::ols_step_all_possible(model)
# literature: wastewater
Fitting data.
p2<- ggplot(
data_fit #%>% filter(Region %in% c("BRA", "IDN", "IND") )
, aes( x= (kcr_processing_wm * 37.4412095 +
kcr_processing_wm*driv_pop * -0.0289276 +
kcr_processing_wm*driv_proc_others * 0.2461490),
y= GWP_100_CH4_Processing, label = Region, Year= Year ,
country = info_countrynames
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))
Test data.
p2<- ggplot(
data_test #%>% filter(Region %in% c("BRA", "IDN", "IND") )
, aes( x= kcr_processing_wm * 37.4412095 +
kcr_processing_wm*driv_pop * -0.0289276 +
kcr_processing_wm*driv_proc_others * 0.2461490,
y= GWP_100_CH4_Processing, label = Region, Year= Year ,
country = info_countrynames
#color = -4.212e+02 * driv_dS + driv_aff * 5.742e-03 + driv_govI * 1.019e+03
))+
geom_point(
aes( )
)+
#geom_text(aes(label = Region))+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))